---
title: "Cleaning"
output: html_document
---

```{r}
# set r markdown chunk defaults: keep warnings and messages out of the knitted
# output (TRUE/FALSE spelled out -- `F` is a reassignable variable, not a constant)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
```

# Description

This file imports the raw YouGov survey data and the MTurk pretest headline ratings, and exports the cleaned data files (`long.csv` and `yougov.csv`) that are used in the analysis scripts.

# Setup

```{r}

# clearing memory --------------------------------------------------------------
# NOTE(review): this removes only the visible objects of the current
# environment (ls() skips dot-prefixed names by default). It does not detach
# packages or reset options, so it is not equivalent to a fresh R session.
rm(list = ls())

# load packages ----------------------------------------------------------------
library(here)
library(tidyverse)
library(survey)
library(srvyr)
library(ggeffects)
library(stargazer)
library(broom)
library(ggstance)
library(ggpubr)
library(arm)
library(gridExtra)
library(doParallel)
library(foreach)
library(broom)
library(margins)
library(patchwork)
library(relimp, pos = 4)
library(paran)
library(clarify)
library(viridis)
library(stringr)
library(readxl)

# useful functions -------------------------------------------------------------

# function to calculate standard error of mean
#
# var: numeric vector; missing values are ignored.
# returns: the standard deviation of the non-missing values divided by the
#   square root of their count (NA when fewer than two values are non-missing).
std_error <- function(var){
  n_obs <- sum(!is.na(var))
  # TRUE is spelled out: `T` is an ordinary variable and can be reassigned
  sd(var, na.rm = TRUE) / sqrt(n_obs)
}

# function to manually bin continuous variables
#
# Each value is replaced by the midpoint of its bin of width `bin_size`. The
# bin is identified by its upper edge: the smallest multiple of `bin_size`
# that is >= the value (so x = 0 maps to -bin_size / 2).
bin_fun <- function(x, bin_size){
  upper_edge <- ceiling(x / bin_size) * bin_size
  upper_edge - bin_size / 2
}

# function to put variables on a 0-1 scale
#
# x:   values measured on a scale that starts at 1
# max: the top value of that scale (mapped to 1; a value of 1 is mapped to 0)
rescale_01 <- function(x, max){
  span <- max - 1
  (x - 1) / span
}

# plotting colors (colorblind palette)
cb_black  <- "#000000" # Discernment / all headlines
cb_blue   <- "#0072B2" # Democratic headlines
cb_red    <- "#D55E00" # Republican headlines
cb_green  <- "#009E73" # True headlines
cb_orange <- "#E69F00" # False headlines

# set n_sim
n_sim <- 1e3

# set seed
set.seed(seed = 4787)

# load YouGov survey data (read as a base data.frame)
yougov <- read.csv(file = here("data", "yougov", "roadsafety_US_final2.csv"))

# load mturk ratings data from the "all" sheet of the pretest workbook
mturk_0 <- readxl::read_excel(
  path = here("data", "pretest", "Pretest6 results_bg_edited.xlsx"),
  sheet = "all"
)


```

# Data Manipulation


## Clean Pretest Data

```{r}

# yougov's key that matches different headline IDs together
request_0 <- read.csv(here("data", "pretest", "guay_data_request_2024.csv"))

# keep the headline ID plus a space-free copy of `file_name` (the merge key
# used below); rows whose ID contains "COVID" are dropped
request <- 
  request_0 %>% 
  mutate(hdl_text_nospaces = str_remove_all(file_name, " ")) %>%
  dplyr::select(hdl_id = varname, hdl_text_nospaces) %>%
  filter(!str_detect(hdl_id, "COVID"))

# build the same space-free key from the `Headline` column of the ratings
mturk_1 <- 
  mturk_0 %>%
  mutate(hdl_text_nospaces = str_remove_all(Headline, " "))

# inner merge on the key: rated headlines with no match in `request` are dropped
mturk <- merge(mturk_1, request, by = "hdl_text_nospaces")

# remove pre-test data for the headline not used in survey 
mturk <- mturk[mturk$hdl_id != "REALREP8",]

# create mean dem/rep scores for pre-test ratings
mturk$partisan_all_rater <- (mturk$partisan_dem_rater + mturk$partisan_rep_rater) / 2
mturk$true_all <- (mturk$true_dem + mturk$true_rep) / 2

# rename columns
mturk <- 
  mturk %>%
  rename(rating_all_betterForReps = partisan_all_rater, 
         rating_dem_betterForReps = partisan_dem_rater, 
         rating_rep_betterForReps = partisan_rep_rater, 
         rating_all_likelyTrue = true_all, 
         rating_dem_likelyTrue = true_dem,
         rating_rep_likelyTrue = true_rep)

```

## Clean YouGov Data

```{r}

# experimental condition and headline type variables ===========================

# intervention type ------------------------------------------------------------
# adtest_split: 1 = accuracy, 2 = psa, 3 = headline, 4 = control; any other
# non-missing code becomes "other" and a missing code stays NA
yougov <-
  yougov %>%
  mutate(cond_int = dplyr::case_when(
    is.na(adtest_split) ~ NA_character_,
    adtest_split == 1 ~ "accuracy",
    adtest_split == 2 ~ "psa",
    adtest_split == 3 ~ "headline",
    adtest_split == 4 ~ "control",
    TRUE ~ "other"
  ))

# "headline" respondents with headfeedback_split == 1 get their own level
yougov$cond_int[which(yougov$cond_int == "headline" &
                        yougov$headfeedback_split == 1)] <- "headlineFeedback"
table(yougov$cond_int, yougov$headfeedback_split, useNA = "always")
table(yougov$cond_int)

# headline type ----------------------------------------------------------------
# covpol_split: 1 = political, 2 = covid; other non-missing codes are "other"
yougov <-
  yougov %>%
  mutate(cond_hdl = dplyr::case_when(
    is.na(covpol_split) ~ NA_character_,
    covpol_split == 1 ~ "political",
    covpol_split == 2 ~ "covid",
    TRUE ~ "other"
  ))
table(yougov$covpol_split, yougov$cond_hdl)

# restrict data to political headlines
yougov <- filter(yougov, cond_hdl == "political")
dim(yougov)

# feedback type (intervention: headline condition only) and pooled treatment ----
# cond_feedback: headfeedback_split 1 = feedback, 2 = noFeedback, else "other"
# pooledTreat:   0 for the control condition, 1 for every other condition
yougov <-
  yougov %>%
  mutate(
    cond_feedback = dplyr::case_when(
      is.na(headfeedback_split) ~ NA_character_,
      headfeedback_split == 1 ~ "feedback",
      headfeedback_split == 2 ~ "noFeedback",
      TRUE ~ "other"
    ),
    pooledTreat = as.numeric(cond_int != "control")
  )
table(yougov$cond_feedback, yougov$headfeedback_split)
table(yougov$pooledTreat, yougov$cond_int)

# dependent variables ==========================================================

# column names of the sharing items, built from prefix + item number
share_vars_fakedem <- paste0("FAKEDEM", 1:15)
share_vars_fakerep <- paste0("FAKEREP", 1:15)

share_vars_realdem <- paste0("REALDEM", 1:15)
# item 8 is skipped for REALREP
share_vars_realrep <- paste0("REALREP", (1:15)[-8])

share_vars_all <- c(
  share_vars_fakedem,
  share_vars_fakerep,
  share_vars_realdem,
  share_vars_realrep
)

# recode demographic & political covariates ====================================

# party ------------------------------------------------------------------------
# party:     pid7_with_leaners 1 = dem, 2 = rep, 3 = ind, anything else NA
# party_rep: 1 for rep, 0 for dem, NA otherwise (party_dem is its complement)
# pid7:      code 8 is set to missing
yougov <-
  yougov %>%
  mutate(
    party = dplyr::case_when(
      pid7_with_leaners == 1 ~ "dem",
      pid7_with_leaners == 2 ~ "rep",
      pid7_with_leaners == 3 ~ "ind"
    ),
    party_rep = dplyr::case_when(party == "dem" ~ 0, party == "rep" ~ 1),
    party_dem = 1 - party_rep,
    pid7 = replace(pid7, pid7 == 8, NA)
  )

table(yougov$pid7, yougov$party_rep, useNA = "always")

# ideology ---------------------------------------------------------------------
# ideology is built from the raw ideo5 codes (4-5 conservative, 1-2 liberal,
# everything else NA) BEFORE ideo5 itself is recoded.
# NOTE(review): ideo5 code 6 is recoded to 4 -- confirm 4 is the intended target.
yougov <-
  yougov %>%
  mutate(
    ideology = dplyr::case_when(
      ideo5 %in% 4:5 ~ "conservative",
      ideo5 %in% 1:2 ~ "liberal"
    ),
    ideo5 = replace(ideo5, ideo5 == 6, 4)
  )

# need for chaos ---------------------------------------------------------------
chaos_vars <- paste0("chaos_", 1:8)
apply(yougov[,chaos_vars], 2, table, useNA = "always")

# row mean over the non-missing chaos items, rescaled with max = 7
# (a row with all eight items missing yields NaN)
yougov$chaos_mean_01 <- rescale_01(rowMeans(yougov[,chaos_vars], na.rm = TRUE), max = 7)

# binned version of the rescaled score (bin width .10)
yougov <- 
  yougov %>%
  mutate(chaos_binned =  bin_fun(chaos_mean_01, bin_size = .10))

# political interest -----------------------------------------------------------
table(yougov$newsint, useNA = "always")
# code 7 is treated as missing
yougov$newsint[yougov$newsint == 7] <- NA
table(yougov$newsint, useNA = "always")
# reverse newsint (5 - newsint) and rescale with max = 4
yougov$political_interest_01 <- rescale_01(5-yougov$newsint, max = 4)
# cross-check the recode; the column is referenced by its full name because
# `political_interest` only resolved through `$` partial matching
table(yougov$newsint, yougov$political_interest_01, useNA = "always")

# gender -----------------------------------------------------------------------
# female: 1 when gender == 2, 0 when gender == 1, missing otherwise
yougov <-
  yougov %>%
  mutate(female = dplyr::case_when(gender == 2 ~ 1, gender == 1 ~ 0))
table(yougov$female, yougov$gender, useNA = "always")

# education --------------------------------------------------------------------
# copied unchanged from educ4
table(yougov$educ4, useNA = "always")
yougov <- yougov %>% mutate(education = educ4)

# age --------------------------------------------------------------------------
# copied unchanged from age5, whose categories are:
#   1 = 18-29
#   2 = 30-44
#   3 = 45-54
#   4 = 55-64
#   5 = 65+
yougov <- yougov %>% mutate(age = age5)

# create vector of demographic variable names
dem_vars <- c("age", "female", "education")

# other variables -------------------------------------------------------------

# *_01 columns are rescaled with rescale_01(), *_sd columns are z-scores.
# faminc_new: code 97 becomes missing and values of 13 or more are capped at 13
# before the z-score is taken.
yougov <-
  yougov %>%
  mutate(
    share_accurate_01 = rescale_01(share_accurate, max = 5),
    age_sd = as.numeric(scale(age)),
    education_01 = rescale_01(education, max = 4),
    news_is_biased_01 = biased_news - 1,
    trust_news_soc_01 = rescale_01(trust_news_soc, max = 5),
    fb_use_01 = rescale_01(fb_use, max = 5),
    faminc_new = pmin(replace(faminc_new, faminc_new == 97, NA), 13),
    faminc_new_sd = as.numeric(scale(faminc_new))
  )

```

## Reshape YouGov Data

Reshape from wide to long (one row for each sharing intention, multiple rows per respondent). 

```{r}

# create standardized versions of variables (z-scores of the 0-1 versions)
yougov$share_accurate_sd <- as.numeric(scale(yougov$share_accurate_01))
yougov$chaos_mean_sd <- as.numeric(scale(yougov$chaos_mean_01))
yougov$political_interest_sd <- as.numeric(scale(yougov$political_interest_01))
yougov$education_sd <- as.numeric(scale(yougov$education_01))
yougov$news_is_biased_sd <- as.numeric(scale(yougov$news_is_biased_01))
yougov$trust_news_soc_sd <- as.numeric(scale(yougov$trust_news_soc_01))
yougov$fb_use_sd <- as.numeric(scale(yougov$fb_use_01))

# create vector of independent variable names
iv_vars <- c(
  "party_rep", "party_dem",
  "share_accurate_sd", "chaos_mean_sd", "political_interest_sd",
  "age_sd", "education_sd",
  "news_is_biased_sd", "trust_news_soc_sd",
  "fb_use_sd", "faminc_new_sd"
)

# reshape data -----------------------------------------------------------------
long <- 
  tibble(yougov) %>%
  
  # keep ids, weights, conditions, covariates and the sharing items
  dplyr::select(caseid = id, weight,
                party, party_rep, party_dem, pid7, ideology, ideo5,
                cond_int, pooledTreat, cond_hdl, cond_feedback, 
                all_of(share_vars_all), 
                all_of(dem_vars), 
                all_of(iv_vars)) %>%
  
  # wide -> long: stack the sharing items (every other kept column is held fixed)
  gather(key = "hdl_id", value = "share", all_of(share_vars_all)) %>%
  
  # headline characteristics, read off the item name
  mutate(
    hdl_fakeReal = factor(
      dplyr::case_when(str_detect(hdl_id, "REAL") ~ "Real",
                       str_detect(hdl_id, "FAKE") ~ "Fake"),
      levels = c("Fake", "Real")),
    hdl_demRep = factor(
      dplyr::case_when(str_detect(hdl_id, "DEM") ~ "dem",
                       str_detect(hdl_id, "REP") ~ "rep"),
      levels = c("dem", "rep"))
  ) %>%
  
  # attach the mturk pretest ratings of each headline
  left_join(mturk, by = "hdl_id") %>%
  
  # drop rows without an outcome; this must happen before `share` is
  # standardized so the z-score uses observed rows only
  filter(!is.na(share)) %>%
  
  mutate(
    # standardized version of the share DV
    share_sd = as.numeric(scale(share)),
    
    # fix the order of the intervention levels (control first)
    cond_int = factor(cond_int,
                      levels = c("control",
                                 "accuracy",
                                 "psa", 
                                 "headlineFeedback",
                                 "headline")),
    
    # store the feedback condition as a factor
    cond_feedback = factor(cond_feedback),
    
    # binary concordance: 1 if the headline's party matches the respondent's,
    # 0 if not, NA for independents
    hdl_agreeable = ifelse(party == "ind", NA, as.numeric(party == hdl_demRep)),
    
    # share variable on a 0-1 scale
    share_01 = rescale_01(share, max = 6),
    
    # 0 for fake headlines, 1 for real ones
    hdl_true = as.numeric(hdl_fakeReal != "Fake")
  )

```

# Export Cleaned Data

```{r}

# export cleaned data -------------------------------------------
# long file: one row per respondent x headline
write.csv(x = long, file = here("data", "cleaned", "long.csv"))
# wide file: the cleaned respondent-level data
write.csv(x = yougov, file = here("data", "cleaned", "yougov.csv"))

```
